## Data processing for:
##
## Christopher Adolph, Kenya Amano, Bree Bang-Jensen, 
## Nancy Fullman, Beatrice Magistro, Grace Reinke, Rachel Castellano,## Megan Erickson, and John Wilkerson, "The Pandemic Policy U-Turn:## The role of partisanship, public health, and race in decisions
## to ease COVID-19 social distancing policies in the U.S.", 
## forthcoming in Perspectives on Politics.
##
## This file:
## makes the analysis dataset AnalysisData.Rdata
##
## Kenya Amano
## 16 June 2021


# Clear memory
rm(list = ls(all.names = TRUE))


## packages (all available on cran)
UsedPackages <- 
  c("readxl", 
    "readstata13",
    "lubridate",
    "foreign",
    "tidyverse")

# Install packages not yet installed
InstalledPackages <- UsedPackages %in% rownames(installed.packages())
if (any(InstalledPackages == FALSE)) {
  install.packages(UsedPackages[!InstalledPackages])
}

## Load R packages
shh = lapply(UsedPackages, library, character.only=TRUE)



# Set base directory
basedir <- getwd()
rowdata.folder <- paste(basedir, "rawdata",
                        sep = "/")

source("HelperProcessing.R")

###########################################################################################
# Epidemiological Data

# Epidemiological(COVID-19) data from NYTimes, Johns Hopkins Coronavirus Resource Center, 
# and The COVID Tracking Project

# READ COVID DATA
# Original data are from NYTimes and JHU CSSE at county-day level
# and from  Covid Tracking Project
#
# These data were collected by the authors over time from the NYT, JHU, and CTP
# repositories in an effort to match as closely as possible the contemporaneous 
# data available to policy makers, rather than later (possibly corrected)
# versions of the data.  Below are summaries of when the data were collected 
# for different date ranges and sources:
#
# JHU:  
# From the beginning to 22 March 2020: Collected 24 March 2020
# 23 March 2020 to 4 April 2020: Real-time data collected daily
# 4 April 2020 to 31 July 2020: Collected 23 September 2020
#
# NYT and CPT 
# From the beginning to 31 July 2020: Collected 23 September 2020
#
###########################################################################################

covidDFCox0 <- 
  read.csv(paste(rowdata.folder, "COVIDdata.csv", sep="/"))


## Read population data (number)
df.pop0 <- 
  read.csv(paste(rowdata.folder, "gbd2017_statepopest_2017.csv", sep="/"),
           header=TRUE, na.strings=".", stringsAsFactors = FALSE, as.is=T) %>% 
  filter(sex_name == "Both") %>% 
  select(state_name, age_group_name, val) %>% 
  rename(state = state_name,
         group = age_group_name) %>% 
  mutate(group = map_chr(group, age_bracket_fun)) %>%   # Change variable name by function
  filter(group != "delete")


# Total population only
df.pop <- 
  df.pop0 %>% 
  filter(group == "Pop") %>% 
  rename(Pop = val) %>% 
  select(-group) %>% 
  mutate(PopMillion = Pop/1000000)


# Create additional variables
covidDFCox <-     
covidDFCox0 %>% 
  
  group_by(state_name, COVIDSource) %>% 
  
  left_join(df.pop %>% rename(state_name = state) %>% select(-Pop),
            by = "state_name") %>% 
  
  mutate(ConfirmedCasesChange = ConfirmedCases - lag(ConfirmedCases),
         DeathsChange  = Deaths - lag(Deaths),
         
         NewConfirmedCasesPerM = ConfirmedCasesChange/PopMillion,
         NewDeathsPerM = DeathsChange/PopMillion,
         
         NewConfirmedCasesPerM7DaysMAVG =  Create7DaysMAVG(NewConfirmedCasesPerM),
         NewDeathsPerM7DaysMAVG         =  Create7DaysMAVG(NewDeathsPerM)
         
         ) %>% 
  select(-PopMillion) %>% 
  ungroup()



#####################################################################################
# Create trend variables
# Using function from HelperProcessing.R

###########################################################
## Caution!! This takes long 
###########################################################

covidDFCoxSlopeDF <- 
  bind_rows(
    
    # For NYT data
    tibble(Model = "LM",
           vars =  c("NewConfirmedCasesPerM",
                     "TestPositivityRate")) %>% 
      mutate(modelDF = list(covidDFCox %>% filter(COVIDSource == "NYT")),
             COVIDSource = "NYT") %>% 
      
      mutate(SlopeDF = pmap(list(Model = Model,
                                 vars = vars,
                                 DF = modelDF),
                            CreateSlopeDF))%>% 
      mutate(VarName = paste0(vars, "LagSlope", Model)) %>% 
      select(SlopeDF, VarName, COVIDSource) %>% 
      unnest(SlopeDF) %>% 
      pivot_wider(names_from = VarName, values_from = LagSlope),
    
    # For JHU data
    tibble(Model = "LM",
           vars =  c("NewConfirmedCasesPerM",
                     "TestPositivityRate")) %>%
      mutate(modelDF = list(covidDFCox %>% filter(COVIDSource == "JHU")),
             COVIDSource = "JHU") %>% 
      
      mutate(SlopeDF = pmap(list(Model = Model,
                                 vars = vars,
                                 DF = modelDF),
                            CreateSlopeDF))%>% 
      mutate(VarName = paste0(vars, "LagSlope", Model)) %>% 
      select(SlopeDF, VarName, COVIDSource) %>% 
      unnest(SlopeDF) %>% 
      pivot_wider(names_from = VarName, values_from = LagSlope),
    
    # For CTP data
    tibble(Model = "LM",
           vars =  c("NewConfirmedCasesPerM",
                     "TestPositivityRate")) %>%
      mutate(modelDF = list(covidDFCox %>% filter(COVIDSource == "CTP")),
             COVIDSource = "CTP") %>% 
      
      
      mutate(SlopeDF = pmap(list(Model = Model,
                                 vars = vars,
                                 DF = modelDF),
                            CreateSlopeDF))%>% 
      mutate(VarName = paste0(vars, "LagSlope", Model)) %>% 
      select(SlopeDF, VarName, COVIDSource) %>% 
      unnest(SlopeDF) %>% 
      pivot_wider(names_from = VarName, values_from = LagSlope)
  )


covidDFCox <- 
  covidDFCox %>% 
  left_join(covidDFCoxSlopeDF,
            by = c("state_name", "date_num", "COVIDSource"))

#####################################################################################
# Create diffusion variables

neighborData <- 
  read.csv(paste(rowdata.folder,"Diffusion Neighboring States.csv", sep="/"),
           stringsAsFactors=FALSE)

lStates <- state.name

neighbors <- list()
for (i in 1:length(lStates)) {
  curNeigh <- as.character(neighborData[neighborData$state==lStates[i], 2:8])
  curNeigh <- curNeigh[curNeigh!=""]
  neighbors[[lStates[i]]] <- curNeigh
}

covidDFCox <- 
  covidDFCox %>% 
  mutate(DaysSinceFCT = difftime(ymd(date_num), ymd("20200226"), units="days") %>% 
                        as.numeric())


  covidDFCox$MeanNeighNewDeathsPerM7DaysMAVG <- 
  rep(0, nrow(covidDFCox))
  
  
###########################################################
## Caution!! This takes long 
###########################################################
  
##  Geographic diffusion 
##  loop and insert the variable 

for (i in 1:nrow(covidDFCox)) {
  curState <- covidDFCox$state_name[i]
  curDay <- covidDFCox$DaysSinceFCT[i]
  curSenders <- neighbors[[curState]]
  if (length(curSenders)>0) {
    getRows <- (covidDFCox$state_name %in% curSenders)&(covidDFCox$DaysSinceFCT==curDay)
    
    covidDFCox$MeanNeighNewDeathsPerM7DaysMAVG[i] <- 
      mean(covidDFCox$NewDeathsPerM7DaysMAVG[getRows])
  }
}

################################################################
## Finalize data frame and select required variables
covidDFCox <- 
  covidDFCox %>% 
  rename(state = state_name) %>%
  
  select(state, date_num, COVIDSource,
         NewDeathsPerM7DaysMAVG,
         NewConfirmedCasesPerMLagSlopeLM,
         TestPositivityRateLagSlopeLM,
         MeanNeighNewDeathsPerM7DaysMAVG) %>% 
  
  
  ## Replace negative death and case counts with 0
  mutate_at(vars(NewDeathsPerM7DaysMAVG,
                 MeanNeighNewDeathsPerM7DaysMAVG), 
              list(~ case_when(. < 0 ~ 0,
                               TRUE   ~ .))) %>%
  mutate(origNewDeathsPerM7DaysMAVG  = NewDeathsPerM7DaysMAVG) %>% 
    
  ## Dummy out zero death and case count
  mutate(NoDeaths7DaysMAVG = 
           if_else(NewDeathsPerM7DaysMAVG == 0, 1, 0),
         NoNeighNewDeaths7DaysMAVG =
             if_else(MeanNeighNewDeathsPerM7DaysMAVG == 0, 1, 0)) %>% 
    
  ## Replace 0 counts with 1 (without loss of generality as long as indicator is included
  ## in model as additional covariate)
  mutate_at(vars(NewDeathsPerM7DaysMAVG, 
                 MeanNeighNewDeathsPerM7DaysMAVG), 
              list(~ case_when(. == 0 ~ 1,
                               TRUE   ~ .))) %>% 
  ## Create logged version of death and case counts
  mutate(logNewDeathsPerM7DaysMAVG = log(NewDeathsPerM7DaysMAVG)) 
    


#######################################################################################################
# State Policy Data


####################################################
# GENERATE DATES 
# Start at 20-Feb-2020 and 31-July-2020
####################################################

date_beg <- '2020-02-20' # start our time period at Feb 20, 2020
date_end <- '2020-07-31' # End July 31, 2020

dateDF <- 
  tibble(date=as.Date(date_beg):as.Date(date_end)) %>%
  mutate(date=as.Date(date,origin="1970-01-01"),
         year=as.numeric(paste0(20,format(date, "%y"))),
         month=as.numeric(format(date, "%m")),
         day=as.numeric(format(date,"%d"))) %>% 
  mutate(date_num = paste0(str_sub(date, start = 1L, end = 4L),
                           str_sub(date, start = 6L, end = 7L),
                           str_sub(date, start = -2L, end = -1L)) %>% 
           as.numeric()) %>% 
  select(-year, -month, -day)

####################################################################
# READ State Policy DATA
# StatePolicy.csv
####################################################################

## Read csv 
StatePolicyTable <- 
  read.csv(paste(rowdata.folder, "StatePolicy.csv", sep="/"))


PolicyList <- 
  StatePolicyTable %>% 
  distinct(StatePolicy)


DateTypeList <- 
  StatePolicyTable %>% 
  distinct(DateType)


## Merge State policy table with date variables
# Setup date, date, and list of policies
StatePolicyDF0 <- 
  merge(tibble(state_name = state.name), 
        dateDF) %>% 
  merge(PolicyList) %>%
  merge(DateTypeList) 


# Left join the policy dates
StatePolicyDF1 <- 
  StatePolicyDF0 %>% 
  left_join(StatePolicyTable %>% rename(state_name = StateName),
            by =c("state_name", "StatePolicy", "DateType")) %>% 
  arrange(state_name, StatePolicy, DateType, date_num) %>% 
  mutate(PolicyDateDummy = 
           case_when(is.na(PolicyDate)==T   ~ 0,
                     date_num==PolicyDate   ~ 1,
                     TRUE ~ 0))


# Create policy dummy variables
StatePolicyDF <- 
  StatePolicyDF1 %>% 
  select(-SubstatePolicyDummy) %>% 
  mutate(PolicyDateDummyAfter =  # Take value 1 after implementation
           case_when(is.na(PolicyDate)==T   ~ 0,
                     date_num <  PolicyDate ~ 0,
                     date_num == PolicyDate ~ 1,
                     date_num >  PolicyDate ~ 1),
         
         PolicyDateNAAfter =     # Take value NA after implementation
           case_when(is.na(PolicyDate)==T   ~ 0,
                     date_num <  PolicyDate ~ 0,
                     date_num == PolicyDate ~ 1,
                     date_num >  PolicyDate ~ 99)) %>% 
  mutate_at(vars(PolicyDateNAAfter), funs(na_if(., 99))) %>%  

  
  
  pivot_wider(names_from = DateType, 
              values_from = c(PolicyDate,
                              PolicyDateDummy,
                              PolicyDateDummyAfter,
                              PolicyDateNAAfter)) %>% 
  
  
  # PolicyFirstSubEaseDV: 
  # NA : before DateEnacted
  # 0  : on or after DateEnacted  *and* before DateSub1stEasing 
  # 1  : on DateSub1stEasing
  # NA : after DateSub1stEasing
  mutate(PolicyFirstSubEaseDV =   
           case_when(PolicyDateDummyAfter_DateEnacted ==0 ~ 99,
                     PolicyDateDummyAfter_DateEnacted==1&
                       PolicyDateDummyAfter_DateSub1stEasing==0 ~ 0,
                     PolicyDateDummy_DateSub1stEasing==1 ~ 1,
                     is.na(PolicyDateNAAfter_DateSub1stEasing)==T ~ 99)) %>% 
  mutate_at(vars(PolicyFirstSubEaseDV), funs(na_if(., 99))) %>% 
  

  # PolicySubNonReligIndoorEaseDV:  
  # NA : before DateEnacted
  # 0  : on or after DateEnacted  *and* before DateSub1stEasing 
  # 1  : on DateSub1stEasing
  # NA : after DateSub1stEasing
  mutate(PolicySubNonReligIndoorEaseDV =   
           case_when(PolicyDateDummyAfter_DateEnacted ==0 ~ 99,
                     PolicyDateDummyAfter_DateEnacted==1&
                       PolicyDateDummyAfter_DateSubNonReligIndoorEasing==0 ~ 0,
                     PolicyDateDummy_DateSubNonReligIndoorEasing==1 ~ 1,
                     is.na(PolicyDateNAAfter_DateSubNonReligIndoorEasing)==T ~ 99)) %>% 
  mutate_at(vars(PolicySubNonReligIndoorEaseDV), funs(na_if(., 99))) %>%
  
  
  # PolicySubNonReligIndoorEasingAtLeastOneBlackCtyDV:  
  # NA : before DateEnacted
  # 0  : on or after DateEnacted  *and* before DateSubNonReligIndoorEasingAtLeastOneBlackCty 
  # 1  : on DateSubNonReligIndoorEasingAtLeastOneBlackCty
  # NA : after DateSubNonReligIndoorEasingAtLeastOneBlackCty
  mutate(PolicySubNonReligIndoorEasingAtLeastOneBlackCtyDV =   
           case_when(PolicyDateDummyAfter_DateEnacted ==0 ~ 99,
                     PolicyDateDummyAfter_DateEnacted==1&
                     PolicyDateDummyAfter_DateSubNonReligIndoorEasingAtLeastOneBlackCty==0 ~ 0,
                     PolicyDateDummy_DateSubNonReligIndoorEasingAtLeastOneBlackCty==1 ~ 1,
                     is.na(PolicyDateNAAfter_DateSubNonReligIndoorEasingAtLeastOneBlackCty)==T ~ 99)) %>% 
  mutate_at(vars(PolicySubNonReligIndoorEasingAtLeastOneBlackCtyDV), funs(na_if(., 99))) %>% 
  
  select(state_name, date:StatePostal, 
         PolicyDateDummyAfter_DateSubNonReligIndoorEasing,
         PolicyFirstSubEaseDV,
         PolicySubNonReligIndoorEaseDV,
         PolicySubNonReligIndoorEasingAtLeastOneBlackCtyDV) %>% 
  
  rename(StateName = state_name) %>% 
  
  mutate(DateFactor = as.factor(date_num)) %>% 
  

  # Add Substate policy Dummy variables
  left_join(
    StatePolicyTable %>% 
      filter(DateType != "DateEnacted") %>%
      select(-StatePostal, -PolicyDate) %>% 
      pivot_wider(names_from = DateType, 
                  names_glue = "SubstatePolicy{DateType}",
                  values_from = SubstatePolicyDummy),
    by = c("StateName", "StatePolicy")
    ) %>% 
  mutate_at(vars(SubstatePolicyDateSub1stEasing:SubstatePolicyDateSubNonReligIndoorEasingAtLeastOneBlackCty),
                ~ replace_na(. , 0)) %>% 
  rename(PolicyType = StatePolicy,
         PolicyDateDummyAfterSubNonReligIndoorEasing = PolicyDateDummyAfter_DateSubNonReligIndoorEasing)


#################################################################
## Make contiguous neighbor diffusion matrix-- avg policy easing

neighborData <- 
  read.csv(paste(rowdata.folder,"Diffusion Neighboring States.csv", sep="/"),
           stringsAsFactors=FALSE)

lStates <- state.name #unique(PanelDF$State)

neighbors <- list()
for (i in 1:length(lStates)) {
  curNeigh <- as.character(neighborData[neighborData$state==lStates[i], 2:8])
  curNeigh <- curNeigh[curNeigh!=""]
  neighbors[[lStates[i]]] <- curNeigh
}

## Start with zeros as missing cases are no neighbors, 
## so variables should be true zeros

StatePolicyDF$PctNeighSubNonReligIndoorEasing <- 
  rep(0, nrow(StatePolicyDF))
#################################################################
## make Desmarais et al diffusion mattrix -- avg policy easing

spidData  <- 
  read.csv(paste(rowdata.folder,"SPID_v1.0_network.csv",sep="/"),
           stringsAsFactors=FALSE)


spidData2011 <- spidData[spidData$year==2011,1:2]
spidData2012 <- spidData[spidData$year==2012,1:2]
spidData2013 <- spidData[spidData$year==2013,1:2]
spidData2014 <- spidData[spidData$year==2014,1:2]
spidData2015 <- spidData[spidData$year==2015,1:2]

spid2011 <- spid2012 <- spid2013 <- spid2014 <- spid2015 <- list()

for (i in 1:length(lStates)) {
  spid2011[[lStates[i]]] <- spidData2011[spidData2011$destination_statenam==lStates[i],"origin_statenam"]
  spid2012[[lStates[i]]] <- spidData2012[spidData2012$destination_statenam==lStates[i],"origin_statenam"]
  spid2013[[lStates[i]]] <- spidData2013[spidData2013$destination_statenam==lStates[i],"origin_statenam"]
  spid2014[[lStates[i]]] <- spidData2014[spidData2014$destination_statenam==lStates[i],"origin_statenam"]
  spid2015[[lStates[i]]] <- spidData2015[spidData2015$destination_statenam==lStates[i],"origin_statenam"]
}


StatePolicyDF$PctAltSubNonReligIndoorEasing <- 
  rep(NA, nrow(StatePolicyDF))


###########################################################
## Caution!! This takes long 
###########################################################

##  Geographic diffusion 
##  loop and insert each vars 
for (i in 1:nrow(StatePolicyDF)) {
  curState <- StatePolicyDF$StateName[i]
  curDay <- StatePolicyDF$DateFactor[i]
  curPolicy <- StatePolicyDF$PolicyType [i]
  curSenders <- neighbors[[curState]]
  if (length(curSenders)>0) {
    getRows <- 
      (StatePolicyDF$StateName %in% curSenders)&(StatePolicyDF$DateFactor==curDay)&(StatePolicyDF$PolicyType==curPolicy)

    StatePolicyDF$PctNeighSubNonReligIndoorEasing[i] <- 
      100*mean(StatePolicyDF$PolicyDateDummyAfterSubNonReligIndoorEasing[getRows])

  }
}

###############################################
##  Peer state diffusion 
##  loop and insert each vars 

for (i in 1:nrow(StatePolicyDF)) {
  curState <- StatePolicyDF$StateName[i]
  curDay <- StatePolicyDF$DateFactor[i]
  curPolicy <- StatePolicyDF$PolicyType[i]
  curSenders <- c(spid2011[[curState]], spid2012[[curState]], spid2013[[curState]],
                  spid2014[[curState]], spid2015[[curState]])
  curWeights <- as.numeric(table(curSenders))/5
  curSenders <- names(table(curSenders))
  
  getRows <- 
    (StatePolicyDF$StateName %in% curSenders)&(StatePolicyDF$DateFactor==curDay)&(StatePolicyDF$PolicyType==curPolicy)
  
  
  matchedStates <- StatePolicyDF$StateName[getRows]
  matchedWeights <- rep(NA, length(matchedStates))
  for (j in 1:length(matchedStates)) {
    matchedWeights[j] <- curWeights[match(matchedStates[j], curSenders)]
  }
  
  StatePolicyDF$PctAltSubNonReligIndoorEasing[i] <- 
    100*weighted.mean(StatePolicyDF$PolicyDateDummyAfterSubNonReligIndoorEasing[getRows], matchedWeights)
  
}


#######################################################################################################
# Wrangling Covariates data
#######################################################################################################


#########################################################
## Read csv: Population &  distribution 
## Date source:: Institute for Health Metrics and Evaluation, 2017
## gbd2017_statepopest_2017.csv
#########################################################

## Read and wrangle population data (number)
df.pop0 <- 
  read.csv(paste(rowdata.folder, "gbd2017_statepopest_2017.csv", sep="/"),
           header=TRUE, na.strings=".", stringsAsFactors = FALSE, as.is=T) %>% 
  filter(sex_name == "Both") %>% 
  select(state_name, age_group_name, val) %>% 
  rename(state = state_name,
         group = age_group_name) %>% 
  mutate(group = map_chr(group, age_bracket_fun)) %>%   # Change variable name by function
  filter(group != "delete") 

# Total population only
df.pop <- 
  df.pop0 %>% 
  filter(group == "Pop") %>% 
  rename(Pop = val) %>% 
  select(-group) %>% 
  mutate(PopMillion = Pop/1000000)



## Percentage of population by age group
df.pop.pct <- 
  df.pop0 %>% 
  mutate(PctPop = paste0("Pct", group)) %>% 
  left_join(df.pop %>% 
              select(-PopMillion), 
            by ="state") %>% 
  
  mutate(Pct = (val/Pop)*100) %>%     #Calculate percentage 
  select(state, PctPop, Pct) %>% 
  pivot_wider(names_from = PctPop,
              values_from = Pct) %>% 
  select(-PctPop) %>%             #Delete Total/Total 
  mutate(PctElderly = 
           PctPop70to74 + PctPop75to79 + PctPop80to84 +
           PctPop85to89 + PctPop90to94 + PctPopOver95) %>% 
  select(state, PctElderly)


#########################################################
## Read csv of Population density by Census 2010
## 2010 Census: Apportionment Data Map

## Original data source:
## https://www.census.gov/2010census/data/apportionment-data-map.html

## Data file: pop_density.csv

#########################################################

df.popdens <- 
  read.csv(paste(rowdata.folder, "pop_density.csv", sep="/"),
           skip = 3,
           header=TRUE, na.strings=".", stringsAsFactors = FALSE, as.is=F)  %>% 
  rename(state = STATE_OR_REGION,
         pop2010 = X2010_POPULATION,
         density2010 = X2010_DENSITY) %>% 
  select(state, pop2010, density2010) %>% 
  filter(state %in% state.name) %>% 
  mutate(density2010 =  gsub(",", "", as.character(density2010)) %>% as.numeric(),
         AreaSqMile = (pop2010*density2010))
  

#########################################################
## Read csv of Urbanization 2010
## Urban Percentage of the Population for States, Historical
## ## Date source:: IOWA State Univ
## ## https://www.icip.iastate.edu/tables/population/urban-pct-counties

## Original data source:Decennial Census, U.S. Census Bureau
## pop-urban-pct-historical.xls
#########################################################

df.urban <- 
  read_excel(paste(rowdata.folder, "pop-urban-pct-historical.xls", sep="/"),
             sheet = "States", 
             skip = 5)%>% 
  rename(state = `Area Name`,
         UrbanPct = `2010`) %>%
  filter(state %in% state.name) %>% 
  select(state, UrbanPct)



#########################################################
## Read csv of Race and ethnicity distribution
## Date source:: Census

## Original data source:
## https://www.census.gov/cps/data/cpstablecreator.html

## RaceCensus2018.csv
#########################################################


df.race <- 
  read.csv(paste(rowdata.folder, "RaceCensus2018.csv", sep="/"),
           header=TRUE, na.strings=".", stringsAsFactors = FALSE, as.is=F)  %>% 
  mutate(state = map_chr(state, abb2name)) %>% 
  select(state, PctHispanic, PctBlack)



#########################################################
## Read csv of political variables
## The National Conference of State Legislators 2020
## Data file: state party control 2020.xlsx
#########################################################

df.poli <- 
  read_excel(paste(rowdata.folder, "state party control 2020.xlsx", sep="/"),
             skip = 2,
             col_names = T) %>% 
  rename(state = STATE) %>% 
  mutate(DemGovernor = case_when(Gov == 0 ~ 1,  # 0 = Dem, 1 = Rep, 2 = Divided
                                 Gov == "N/A" ~ NA_real_,  #Nebraska
                                 TRUE ~ 0)) %>% 
  select(state, DemGovernor)


#########################################################
## Read csv of Trump Vote share
## Date source:: NYT
## Data file:: GovernorPartyandTrumpVoteShare.xlsx
#########################################################

df.trump <- 
  read_excel(paste(rowdata.folder, "GovernorPartyandTrumpVoteShare.xlsx", sep="/"),
             col_names = T) %>% 
  rename(TrumpVote2016 = TrumpVoteShare) %>% 
  mutate(state = map_chr(state,abb2name)) %>% 
  select(state, TrumpVote2016)



#########################################################
## Read csv of Gross state product
## Date source:: KFF
## Original data::U.S. Bureau of Economic Analysis (BEA), 
## Annual Gross Domestic Product in current dollars by State, updated November 14, 2018.

## Data file: KFF_GSP.csv
#########################################################

df.GSP <- 
  read.csv(paste(rowdata.folder, "KFF_GSP.csv", sep="/"),
           skip= 2,
           header=TRUE, na.strings=".", stringsAsFactors = FALSE, as.is=T) %>% 
  rename(state = Location,
         GSP = Total.Gross.State.Product) %>% 
  filter(GSP !="",
         state != "United States") %>% 
  mutate(GSP = str_extract(GSP,"[:digit:]+" ) %>% as.numeric())


#########################################################
## Read csv of poverty rate
## Date source: Census 2018
## https://www.census.gov/topics/income-poverty/poverty/data/data-tools.html
## Data file: PovertyRate.csv
#########################################################

df.poverty <- 
  read.csv(paste(rowdata.folder, "PovertyRate.csv", sep="/")) %>% 
  rename(state = State...County.Name,
         id = County.ID, 
         PovertyPct = All.Ages.in.Poverty.Percent) %>% 
  filter(state %in% state.name) %>% 
  select(state, PovertyPct) 

#########################################################
## Read csv of Sales tax as % of state revenue 
## Date source: UrbanBrookings Tax Policy Center, 2017
## Data file: Sales tax as % of state revenue from Urban Institute.csv

#########################################################

df.saletax <- 
  read.csv(paste(rowdata.folder, "Sales tax as % of state revenue from Urban Institute.csv", sep="/"),
           header=TRUE, na.strings=".", stringsAsFactors = FALSE, as.is=T) %>% 
  rename(state = state,
         SalesTaxRevenueShare = Revenue.sales.tax.2017) %>% 
  mutate(state = map_chr(state,abb2name))




#########################################################
## Read csv of initial claims for unemployment insurance

## Date source: UNITED STATES DEPARTMENT OF LABOR
## https://oui.doleta.gov/unemploy/claims.asp
## https://oui.doleta.gov/unemploy/archive.asp  

## JobLessClaim.csv
#########################################################

# AvailableDate : The date when the data is in public

df.JoblessClaim0 <- 
  read.csv(paste(rowdata.folder, "JobLessClaim.csv", sep="/"),
           header=TRUE, na.strings=".", stringsAsFactors = FALSE, as.is=T) %>% 
  select(state, InsuredUnemploymentRate, AvailableDate) %>% 
  group_by(state) %>% 
  mutate(InsuredUnempRate4WeeksMAVG = Create4DaysMAVG(InsuredUnemploymentRate)) %>% 
  select(-InsuredUnemploymentRate)
  

df.JoblessClaim <- 
  dateDF %>% 
  mutate(Wday = wday(date),
         AvailableDate = case_when(Wday ==5 ~ date,
                                   Wday ==6 ~ date-1,
                                   Wday ==7 ~ date-2,
                                   Wday ==1 ~ date-3,
                                   Wday ==2 ~ date-4,
                                   Wday ==3 ~ date-5,
                                   Wday ==4 ~ date-6)) %>%
  mutate(AvailableDate = paste0(str_sub(AvailableDate, start = 1L, end = 4L),
                                str_sub(AvailableDate, start = 6L, end = 7L),
                                str_sub(AvailableDate, start = -2L, end = -1L)) %>% 
           as.numeric()) %>%
  select(date_num, AvailableDate) %>% 
  merge(state.name) %>% 
  rename(state = y) %>% 
  left_join(df.JoblessClaim0,
            by = c("state", "AvailableDate")) %>% 
  select(-AvailableDate)



#########################################################
## Read csv: Tourism empolyment share
## Data source: Tourism Generates Billions in Tax Revenue for States.
## The Council for State Governments. 
## https://knowledgecenter.csg.org/kc/content/tourismgenerates-billions-tax-revenue-states

## Data file: Council of State Governments Tourism data (2015).xlsx
#########################################################

df.tour <- 
  read_excel(paste(rowdata.folder, "Council of State Governments Tourism data (2015).xlsx", sep="/")) %>% 
  rename(TourismEmpPct =  `percent employment in tourism`) %>% 
  mutate(state = map_chr(state,abb2name))


#########################################################
## Read csv of Education
## Institute for Health Metrics and Evaluation, 2017

## Data file: Ed Attainment, American Communities, 2017.csv
#########################################################

df.edu <- 
  read.csv(paste(rowdata.folder, "Ed Attainment, American Communities, 2017.csv", sep="/"),
           header=TRUE, na.strings=".", stringsAsFactors = FALSE, as.is=T) %>% 
  rename(state =  Geography) %>% 
  mutate(AtLeastCollege = CollegeGrad + GradDegree) %>% 
  select(state, AtLeastCollege)


#########################################################
## Read csv: Ideology Fording
## Date source: Richard C. Fording
## https://rcfording.com/state-ideology-data/

## stateideology_v2018.dta
#########################################################

df.ideologyFording <- 
  read.dta13(paste(rowdata.folder, "stateideology_v2018.dta", sep="/")) %>% 
  select(-state) %>% 
  rename(state =  statename,
         IdeoFordCiti = citi6016,
         IdeoFordNomi = inst6017_nom) %>% 
  mutate(IdeoFordCiti = lag(IdeoFordCiti)) %>%  # Citizen for 2016
  filter(year == 2017) %>%  
  select(-year, -IdeoFordNomi)


############################################################################
############################################################################
##  Merge all data
############################################################################

mData <- 
  
  # State policy
  StatePolicyDF %>% 
  rename(state = StateName) %>% 
  select(-PolicyDateDummyAfterSubNonReligIndoorEasing) %>% 
   
  # Covid data
  left_join(
    covidDFCox,     #  From 01-01-COVID.R
    by = c("state", "date_num")) %>% 
  
  
  left_join(
    df.pop,     # Pop, PopMillion
    by = "state") %>% 

  left_join(
    df.pop.pct,     # PctElderly
    by = "state") %>% 
  
  
  left_join(
    df.popdens,     
    by = "state") %>% 
   mutate(PopDensity  = AreaSqMile/(Pop)) %>%  # Population density 2017
   select(-pop2010, -density2010, -AreaSqMile) %>% 
  
  left_join(
    df.urban,     # UbanPct
    by = "state") %>% 
  
  
  left_join(
    df.race,     # Race variables
    by = "state") %>% 
  

  left_join(
    df.poli,         # DemGovernor
    by = "state") %>% 
  
  
  left_join(
    df.trump,         # Trump vote share
    by = "state") %>% 
  
  
  left_join(
    df.saletax,         # Sales Tax
    by = "state") %>%
  
  
  left_join(
    df.GSP,         # Gross State Product
    by = "state") %>%
  # GSP = million, popMillion = million -> per capita
  mutate(GSPPerCapita =  GSP/(PopMillion)) %>%   
  select(-GSP, -Pop, -PopMillion) %>% 
  
  
  left_join(
    df.poverty,         # Poverty rate
    by = "state") %>%
  
  
  left_join(
    df.JoblessClaim,        # Weekly jobless data
    by = c("state", "date_num")) %>% 
  

  left_join(
    df.edu,         # Education
    by = "state") %>%
  

  left_join(
    df.tour,         # Tourism employment
    by = "state") %>%
  
  
  left_join(
    df.ideologyFording,         # Ideology by Fording
    by = "state") %>%
  
  # Specify the states that had gubernatorial elections in 2020 according to wikipedia
  mutate(ElectYear = if_else(state %in% c("New Hampshire",
                                          "Vermont",
                                          "Delaware",
                                          "Indiana",
                                          "Missouri",
                                          "Montana",
                                          "North Carolina",
                                          "North Dakota",
                                          "Utah",
                                          "Washington",
                                          "West Virginia"),
                             1, 0)) %>% 
  
  rename(StateName = state) %>% 
  
  #Set FCT
  mutate(DaysSinceFCT = difftime(ymd(date_num), ymd("20200226"), units="days") %>% as.numeric()) %>% 
  select(-DateFactor, -date_num)

## Save analysis data
save(mData, file="AnalysisData.Rdata")


